home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Celestin Apprentice 4
/
Apprentice-Release4.iso
/
Source Code
/
Libraries
/
DCLAP 4j
/
DBio
/
DSeqFile.cpp
< prev
next >
Wrap
Text File
|
1995-12-17
|
11KB
|
455 lines
// DSeqFile.cp
// d.g.gilbert
#include <ncbi.h>
#include <dgg.h>
#include <stdio.h>
#include <DFile.h>
#include "DSeqFile.h"
#include "DSeqList.h"
#include "ureadseq.h"
Global char* gOutputName = NULL;
Boolean DSeqFile::gWriteMasks = true;
static char* kMaskName = "#Mask";
#if NEED_LATER
// must be in scope of a DSeqFile:: method
Local const struct formatTable {
char *name;
short num;
} formname[] = {
{"ig", kIG},
{"stanford", kIG},
{"genbank", kGenBank},
{"gb", kGenBank},
{"nbrf", kNBRF},
{"embl", kEMBL},
{"gcg", kGCG},
{"uwgcg", kGCG},
{"dnastrider", kStrider},
{"strider", kStrider},
{"fitch", kFitch},
{"pearson", kPearson},
{"fasta", kPearson},
{"zuker", kZuker},
{"olsen", kOlsen},
{"phylip", kPhylip},
{"phylip3.2", kPhylip2},
{"phylip3.3", kPhylip3},
{"phylip3.4", kPhylip4},
{"phylip-interleaved", kPhylip4},
{"phylip-sequential", kPhylip2},
{"plain", kPlain},
{"raw", kPlain},
{"pir", kPIR},
{"codata", kPIR},
{"asn.1", kASN1},
{"msf", kMSF},
{"paup", kPAUP},
{"nexus", kPAUP},
{"pretty", kPretty},
{0,0}
};
#endif
static char *gSeqFileFormats[] = { //DSeqFile::kMaxFormat+1
"IG|Stanford",
"GenBank|GB",
"NBRF",
"EMBL",
"GCG",
"DNAStrider",
"Fitch",
"Pearson|Fasta",
"Zuker [in only]",
"Olsen [in only]",
"Phylip3.2|Phylip2",
"Phylip|Phylip4",
"Plain|Raw",
"PIR|CODATA",
"MSF",
"ASN.1",
"PAUP|NEXUS",
"Pretty [out only]",
0};
const char* DSeqFile::FormatName(short index)
{
if (index<1 || index>18) return "";
else return gSeqFileFormats[index-1];
}
short DSeqFile::FormatFromName(const char* name)
{
if (!name) return kUnknown;
long i, len1, namelen= StrLen(name);
char *form1, *form2;
if (namelen<2) return kUnknown;
for (i=0; i<kMaxFormat; i++) {
form1= (char*) gSeqFileFormats[i];
form2= StrChr(form1,'|');
if (form2) { len1= form2 - form1; form2++; }
else len1= StrLen( form1);
#if 1
if (StrNICmp(name, form1,len1) == 0) return i+1;
else if (form2 && StrICmp(name, form2) == 0) return i+1;
#else
// problem w/ Phylip matching Phylip3.2 not Phylip
if (StrNICmp(name, form1, namelen) == 0) return i+1;
else if (form2 && StrNICmp(name, form2, namelen) == 0) return i+1;
#endif
}
return kUnknown;
}
char* DSeqFile::FixSeqID( char* s, short leftmargin)
{
#if 1
register char * cp, * ep;
for (cp= s; *cp && isspace(*cp); cp++) ;
for (ep= cp; *ep && !isspace(*ep); ep++) ;
if (ep>cp && ep[-1] == ',') ep--; // fix for fasta "name, #bases, # checksum"
if (*ep) *ep= 0;
return cp;
#else
//s= StrDup(s);
short len = StrLen(s);
while (s[--len] == ' ' && len>0) ;
s[++len]= 0;
short i = Min( len-1, leftmargin);
while (s[i] == ' ' && i<len) i++;
if (i>0) s += i; //MemMove(s, s+i, len-i+1); //s= s+i;
//if (StrLen(s) > kNameWidth) s[kNameWidth]= 0;
return s;
#endif
}
short DSeqFile::SeqFileFormatWrapper( DFile* aFile)
{
long skiplines = 0;
short aFormat, err = 0;
aFile->Open("r");
aFormat= seqFileFormatFp( aFile->fFile, &skiplines, &err);
aFile->Close();
return aFormat;
}
short DSeqFile::ReadSeqFile( DFile* aFile, DSeqList* aSeqList)
{
char seqid[256], *seqidptr, *bases, *seqlist;
long skiplines = 0, skipi, seqlen;
short aFormat, atseq = 0, whichSeq = 0, nseq = 0, err = 0;
char *defname;
Boolean notdone = true, needrewind = false;
DSequence* aSeq;
aFile->OpenFile("r");
aFormat = seqFileFormatFp( aFile->fFile, &skiplines, &err);
aFile->OpenFile(); // rewind
defname= FixSeqID( gDefSeqName, 0);
switch (aFormat) {
case kUnknown:
case kNoformat:
aFile->CloseFile();
return aFormat;
// all multi-pass (interleaved) formats go here !
case kOlsen:
case kMSF:
case kPAUP:
case kPhylip2:
case kPhylip4:
needrewind = true;
aFile->CloseFile();
seqlist = listSeqs( aFile->GetName(), skiplines, aFormat, &nseq, &err);
MemFree(seqlist);
break;
}
skipi= skiplines;
while (notdone) {
// this one-pass thru gFileRef may well be buggy for current readSeq
// need to rewind file & count whichSeq++ for each sequence...?
if (needrewind) {
whichSeq++;
atseq= 0;
aFile->OpenFile("r");
skipi= skiplines;
}
else {
whichSeq = 1;
atseq= 0;
}
seqlen = 0; seqidptr= seqid; *seqidptr= 0;
bases = readSeqFp( whichSeq, aFile->fFile, skipi, aFormat, &seqlen,
&atseq, &err, seqidptr);
skipi= 0;
// install bases into new TSequence, and stick TSequence into list
if (bases) {
if (seqlen > 0) {
char *name, * cp;
Boolean ismask= false;
if (*seqidptr != 0) {
if ( (cp= StrStr(seqidptr, kMaskName)) != NULL) {
ismask= true;
*cp= 0;
}
name= FixSeqID( seqidptr, 0);
}
else
name= defname;
if (ismask) {
aSeq= aSeqList->FindNamedSeq( name);
if (aSeq) {
// convert mask...
register char *mp;
for (mp= bases; *mp; mp++) {
*mp -= '?'; // to zero relative
*mp |= 0x80; // add high bit
}
aSeq->SetMasks( bases);
}
}
else {
aSeq = MakeSequence( name, bases, NULL, 0); //gFileRef->moddate
aSeqList->InsertLast(aSeq);
}
}
else notdone= false;
MemFree(bases); // readSeq made this ptr w/ malloc call... MakeSeq copies this data...
}
else notdone= false; // nodata -- if format=0, no data is read
if (needrewind) notdone &= (whichSeq < nseq); // stop now
else notdone &= !aFile->Eof();
}
aFile->CloseFile();
return aFormat;
}
void DSeqFile::WriteSeqHeader( DFile* aFile,
short format, short nseqs, long nbases, char* outputName,
Boolean& needSameSize, Boolean& isInterleaved)
{
const char *kASN1headline = "Bioseq-set ::= {\nseq-set {\n";
char line[256], name[64];
// this is a ureadseq.h call
// need to do somewhere once, then let user set prefs...
#define gPrettyInit1(p) { \
p.isactive=false;\
p.baseonlynum=true;\
p.numline= p.atseq= 0;\
p.numright= p.numtop= true; \
p.numleft= p.numbot= false;\
p.nameright= true; \
p.nameleft= p.nametop= false;\
p.noleaves= p.domatch= p.degap= false;\
p.matchchar='.';\
p.gapchar='-';\
p.namewidth=8;\
p.numwidth=5;\
p.interline=1;\
p.spacer=10;\
p.seqwidth=50;\
p.tab=0; }
gPrettyInit1(gPretty);
//gPretty.interline= 2; // not active, along w/ numtop,numbot,nametop...
// need to work into this class
needSameSize= false;
isInterleaved= false;
aFile->Open("a");
if (outputName) StrNCpy( name, outputName, 63);
else StrCpy( name, DFileManager::kUntitled);
switch (format) {
case kPIR:
aFile->WriteLine( "\\\\\\\n");
break;
case kASN1:
aFile->WriteLine( (char*)kASN1headline);
break;
case kMSF: // this is for top of interleaved file
{
isInterleaved= true;
unsigned long checksum = 0, checkall = 0;
//checksum= GCGchecksum(seq, seqlen, &checkall);
sprintf(line,"\n %s MSF: %d Type: N June 01, 1776 12:00 Check: %d ..\n\n",
name, nbases, checkall);
aFile->WriteLine( line);
}
break;
case kPhylip2:
isInterleaved= false;
goto allphyl;
break;
case kPhylip4:
isInterleaved= true;
allphyl:
{
// if (phylvers >= 4) fprintf(foo," %d %d\n", seqout, seqlen);
// else fprintf(foo," %d %d YF\n", seqout, seqlen);
if (isInterleaved) sprintf( line, " %d %d F\n", nseqs, nbases);
else sprintf( line, " %d %d FI \n", nseqs, nbases);
aFile->WriteLine( line);
needSameSize= true;
}
break;
case kPAUP:
isInterleaved= true;
needSameSize= true;
aFile->WriteLine( "#NEXUS\n");
sprintf( line,"[%s -- data title]\n\n", name);
aFile->WriteLine( line);
break;
case kPretty:
isInterleaved= true;
gPretty.isactive= true;
break;
default:
break;
}
aFile->CloseFile();
}
void DSeqFile::DoInterleave( DFile* toFile, DFile* fromFile,
short outform, short seqtype, short nseqs, long nbases,
short nlines, short noutindex, long* outindex)
{
short leaf, iline, iseq;
char *cp, line[256];
toFile->Open("a");
fromFile->Open("r");
for (leaf=0; leaf<nlines; leaf++) {
if (outform == kMSF && leaf == 1) { // do after writing seq names
toFile->WriteLine( "//\n\n");
}
if (outform == kPAUP && leaf==1) { // do after writing seq names
switch (seqtype) {
case DSequence::kDNA : cp= "dna"; break;
case DSequence::kRNA : cp= "rna"; break;
case DSequence::kNucleic : cp= "dna"; break;
case DSequence::kAmino : cp= "protein"; break;
case DSequence::kOtherSeq: cp= "dna"; break;
}
toFile->WriteLine( "\nbegin data;\n");
sprintf(line," dimensions ntax=%d nchar=%d;\n", nseqs, nbases);
toFile->WriteLine( line);
sprintf(line," format datatype=%s interleave missing=%c", cp, gPretty.gapchar); // gapchar == indelHard ???
toFile->WriteLine( line);
//if (gPretty.domatch) { sprintf(line," matchchar=%c", gPretty.matchchar);
//Mwrite( toFile, line); }
toFile->WriteLine( ";\n matrix\n");
}
for (iseq=0; iseq<noutindex; iseq++) {
fromFile->SetDataMark( outindex[iseq]);
for (iline=0; iline<=leaf; iline++)
if (fromFile->ReadLine( line, 256) != 0) *line= 0;
if (fromFile->Tell() <= outindex[iseq+1])
toFile->WriteLine( line, false); //remember- ReadLine a.k.a fgets retains newline
}
for (iline=0; iline<gPretty.interline; iline++)
toFile->WriteLine("\n"); //LineEnd
}
fromFile->Close();
toFile->Close();
}
void DSeqFile::WriteSeqTrailer( DFile* aFile,
short format, short /*nseqs*/, short /*nlinesperseq*/, long /*nbases*/)
{
switch (format) {
case kASN1:
aFile->Open("a");
aFile->WriteLine("} }\n");
aFile->Close();
break;
case kPAUP:
aFile->Open("a");
aFile->WriteLine(";\n end;\n");
aFile->Close();
break;
}
}
void DSeqFile::WriteSeqWrapper( DFile* aFile,
char *seq, long seqlen, short outform, char *seqid)
{
if (!aFile->IsOpen()) aFile->Open("a");
seqid= StrDup(seqid);
char* cp= FixSeqID(seqid, 0);
gLinesPerSeqWritten= writeSeq( aFile->fFile, seq, seqlen, outform, cp);
MemFree(seqid);
//aFile->Close(); //?? want left open for appending by caller ? e.g., seqmail
}
void DSeqFile::WriteMaskWrapper( DFile* aFile,
char *mask, long seqlen, short outform, char *seqid)
{
char maskid[128];
if (!aFile->IsOpen()) aFile->Open("a");
seqid= StrDup(seqid);
char* cp= FixSeqID(seqid, 0);
StrNCpy( maskid, cp, sizeof(maskid)-5);
StrNCat( maskid, kMaskName, sizeof(maskid));
MemFree(seqid);
// massage mask into writable form
mask= StrDup(mask);
Boolean hasdata= false;
for (cp= mask; *cp; cp++) {
*cp &= 0x7F; // drop hi bit
if (*cp != 0) hasdata= true;
*cp += '?'; // set 0 == '?', mask range is mostly 'A-Za-z'
}
if (hasdata)
gLinesPerSeqWritten= writeSeq( aFile->fFile, mask, seqlen, outform, maskid);
MemFree(mask);
}